Centralize cleanup

This script combines all the cleanup that needs to be done into one central place. (Previously there were too many cleanup scripts in too many places.)

Macro data

This was (mostly) originally in model2.Rmd.

The data is in three files. The macro(economic) data is something to add to both the test and the train records by joining on the timestamp. It makes more sense to do any cleanup of the macro records before joining, however, so will start with that.

Macro EDA

# EDA configuration: where the raw Kaggle inputs live and which column we
# ultimately predict.
input_dir <- '../input'   # consumed by input_filename() (defined elsewhere)
predict_col <- 'price_doc'
# Raw macro(economic) indicators, to be joined to test/train on timestamp.
macro <- read.csv(input_filename('macro.csv'))
macro_drop_cols <- c()   # expect to find some columns to omit.

# Parse the day stamps so they sort, plot, and join as real dates.
# NOTE(review): as.POSIXct without an explicit tz uses the local timezone;
# fine for daily data, but confirm before joining on timestamp.
macro$timestamp <- as.POSIXct(as.character(macro$timestamp))
# Plot each macro indicator over time to eyeball coverage, gaps, and
# outliers. Columns judged uninformative are accumulated in
# macro_drop_cols and removed in the "Macro fixes" section below.
qplot(timestamp, oil_urals, data=macro, geom='line')

qplot(timestamp, gdp_quart, data=macro, geom='line')
## Warning: Removed 90 rows containing missing values (geom_path).

qplot(timestamp, cpi, data=macro, geom='line')
## Warning: Removed 31 rows containing missing values (geom_path).

qplot(timestamp, ppi, data=macro, geom='line')
## Warning: Removed 31 rows containing missing values (geom_path).

qplot(timestamp, gdp_deflator, data=macro, geom='line')
## Warning: Removed 365 rows containing missing values (geom_path).

qplot(timestamp, balance_trade, data=macro, geom='line')
## Warning: Removed 31 rows containing missing values (geom_path).

qplot(timestamp, usdrub, data=macro, geom='line')
## Warning: Removed 3 rows containing missing values (geom_path).

qplot(timestamp, eurrub, data=macro, geom='line')
## Warning: Removed 3 rows containing missing values (geom_path).

qplot(timestamp, brent, data=macro, geom='line')
## Warning: Removed 3 rows containing missing values (geom_path).

qplot(timestamp, net_capital_export, data=macro, geom='line')
## Warning: Removed 396 rows containing missing values (geom_path).

qplot(timestamp, gdp_annual, data=macro, geom='line')

qplot(timestamp, gdp_annual_growth, data=macro, geom='line')

qplot(timestamp, average_provision_of_build_contract, data=macro, geom='line')

qplot(timestamp, average_provision_of_build_contract_moscow, data=macro, geom='line')
## Warning: Removed 365 rows containing missing values (geom_path).

qplot(timestamp, rts, data=macro, geom='line')
## Warning: Removed 10 rows containing missing values (geom_path).

qplot(timestamp, micex, data=macro, geom='line')
## Warning: Removed 10 rows containing missing values (geom_path).

qplot(timestamp, deposits_value, data=macro, geom='line')

qplot(timestamp, deposits_growth, data=macro, geom='line')
## Warning: Removed 31 rows containing missing values (geom_path).

qplot(timestamp, deposits_rate, data=macro, geom='line')
## Warning: Removed 414 rows containing missing values (geom_path).

# mortgage_value is cleaned up into a monotonic variant in the fixes below.
qplot(timestamp, mortgage_value, data=macro, geom='line')

qplot(timestamp, mortgage_growth, data=macro, geom='line')
## Warning: Removed 365 rows containing missing values (geom_path).

qplot(timestamp, mortgage_rate, data=macro, geom='line')

qplot(timestamp, grp, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, grp_growth, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).

qplot(timestamp, income_per_cap, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, real_dispos_income_per_cap_growth, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, salary, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, salary_growth, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, fixed_basket, data=macro, geom='line')

qplot(timestamp, retail_trade_turnover, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, retail_trade_turnover_per_cap, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, retail_trade_turnover_growth, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, labor_force, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, unemployment, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, employment, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, invest_fixed_capital_per_cap, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, invest_fixed_assets, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, profitable_enterpr_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, unprofitable_enterpr_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, share_own_revenues, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, overdue_wages_per_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, fin_res_per_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, marriages_per_1000_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, divorce_rate, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, construction_value, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, invest_fixed_assets_phys, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, pop_natural_increase, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, pop_migration, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, pop_total_inc, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, childbirth, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, mortality, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, housing_fund_sqm, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, lodging_sqm_per_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, water_pipes_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

# has just two values. Drop as likely noise
macro_drop_cols <- c(macro_drop_cols, 'water_pipes_share')
qplot(timestamp, baths_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

# has just two values. Drop as likely noise
macro_drop_cols <- c(macro_drop_cols, 'baths_share')
qplot(timestamp, sewerage_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, gas_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, hot_water_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

# has just three values. Drop as likely noise
macro_drop_cols <- c(macro_drop_cols, 'hot_water_share')
qplot(timestamp, electric_stove_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, heating_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

# has just two values. Drop as likely noise
macro_drop_cols <- c(macro_drop_cols, 'heating_share')
qplot(timestamp, old_house_share, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

# has just two values. Drop as likely noise
macro_drop_cols <- c(macro_drop_cols, 'old_house_share')
qplot(timestamp, average_life_exp, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, infant_mortarity_per_1000_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, perinatal_mort_per_1000_cap, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, incidence_population, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, rent_price_4.room_bus, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).

qplot(timestamp, rent_price_3room_bus, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).

qplot(timestamp, rent_price_2room_bus, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).

qplot(timestamp, rent_price_1room_bus, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).

qplot(timestamp, rent_price_3room_eco, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).

qplot(timestamp, rent_price_2room_eco, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).

# problem with Feb 2013: zoom in around it to see the suspicious value
# (set to NA in the fixes section so imputation can fill it).
qplot(timestamp, rent_price_2room_eco, data=subset(macro, timestamp >='2013-01-01' & timestamp <= '2013-04-01'), geom='line')

qplot(timestamp, rent_price_1room_eco, data=macro, geom='line')
## Warning: Removed 273 rows containing missing values (geom_path).

# problem with May 2013... or what looks like one. Zoom in to inspect.
qplot(timestamp, rent_price_1room_eco, data=subset(macro, timestamp >='2013-02-01' & timestamp <= '2013-07-01'), geom='line')

qplot(timestamp, load_of_teachers_preschool_per_teacher, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).

qplot(timestamp, child_on_acc_pre_school, data=macro, geom='line')

# child_on_acc_pre_school has a '#!' junk level (and embedded commas);
# converted to NA / numeric in the macro fixes section below.
qplot(timestamp, load_of_teachers_school_per_teacher, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, students_state_oneshift, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, modern_education_share, data=macro, geom='line')

# three values, mostly NA. drop.
macro_drop_cols <- c(macro_drop_cols, 'modern_education_share')
qplot(timestamp, old_education_build_share, data=macro, geom='line')

# three values, mostly NA. drop.
macro_drop_cols <- c(macro_drop_cols, 'old_education_build_share')
qplot(timestamp, provision_doctors, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, provision_nurse, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, load_on_doctors, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, power_clinics, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, hospital_beds_available_per_cap, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).

qplot(timestamp, hospital_bed_occupancy_per_year, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).

qplot(timestamp, provision_retail_space_sqm, data=macro, geom='line')
## Warning: Removed 1753 rows containing missing values (geom_path).

# two values, drop
macro_drop_cols <- c(macro_drop_cols, 'provision_retail_space_sqm')
qplot(timestamp, provision_retail_space_modern_sqm, data=macro, geom='line')
## Warning: Removed 1754 rows containing missing values (geom_path).

# two values, drop
macro_drop_cols <- c(macro_drop_cols, 'provision_retail_space_modern_sqm')
qplot(timestamp, turnover_catering_per_cap, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, theaters_viewers_per_1000_cap, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).

qplot(timestamp, seats_theather_rfmin_per_100000_cap, data=macro, geom='line')
## Warning: Removed 293 rows containing missing values (geom_path).

qplot(timestamp, museum_visitis_per_100_cap, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).

qplot(timestamp, bandwidth_sports, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, population_reg_sports_share, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).

qplot(timestamp, students_reg_sports_share, data=macro, geom='line')
## Warning: Removed 1023 rows containing missing values (geom_path).

qplot(timestamp, apartment_build, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

qplot(timestamp, apartment_fund_sqm, data=macro, geom='line')
## Warning: Removed 658 rows containing missing values (geom_path).

Macro fixes

Clean up mortgage value to create a new, monotonic mortgage-value column.

Drop a handful of columns with a very small number of values that look uninteresting.

The rent price room eco data also looks like it has a suspicious value or two in it. Remove those to impute them with something more likely.

# mortgage_value clearly resets every Feb 1, so carry the accumulated
# Jan 31 totals forward to build a monotonically increasing variant.
macro$mortgage_value_montonic <- macro$mortgage_value
jan31_value <- 0
for (year in lubridate::year(min(macro$timestamp)):lubridate::year(max(macro$timestamp))) {
    # NOTE(review): assumes every year in range has a Jan 31 row; if one
    # were missing, subset() returns zero rows and jan31_value would
    # degenerate to numeric(0) -- confirm against the data.
    jan31_value <- jan31_value + subset(macro, timestamp==as.POSIXct(sprintf('%d-01-31', year)))$mortgage_value
    idx <- which(macro$timestamp >= as.POSIXct(sprintf('%d-02-01', year)) & macro$timestamp < as.POSIXct(sprintf('%d-02-01', year+1)))
    macro[idx,'mortgage_value_montonic'] <- jan31_value + macro[idx,'mortgage_value_montonic']
}
# Drop the columns flagged during EDA above (accumulated in macro_drop_cols)
# as having too few distinct values to be useful.
for (col in macro_drop_cols) {
    macro[,col] <- NULL
}
# Null out the suspicious rent-price outliers spotted during EDA (the
# Feb 2013 and May 2013 blips) so imputation can fill in likelier values.
idx <- which(macro$rent_price_2room_eco == 0.1)
macro[idx,'rent_price_2room_eco'] <- NA
idx <- which(macro$rent_price_1room_eco == 2.31)
macro[idx,'rent_price_1room_eco'] <- NA
# child_on_acc_pre_school arrives with '#!' junk entries and embedded
# thousands separators. Treat '#!' as missing, then strip ALL commas
# before converting to numeric. (gsub, not sub: sub removes only the
# first separator, so any value with two separators -- a million or more
# -- would silently become NA.)
macro[,'child_on_acc_pre_school'] <- as.character(macro[,'child_on_acc_pre_school'])
idx <- which(macro[,'child_on_acc_pre_school'] %in% c('#!'))
macro[idx, 'child_on_acc_pre_school'] <- NA
macro[,'child_on_acc_pre_school'] <- as.numeric(gsub(',', '', macro[, 'child_on_acc_pre_school']))


# Visual check that the monotonic mortgage series looks right.
qplot(timestamp, mortgage_value_montonic, data=macro, geom='line')

# confirm we've already removed the near zero variance columns
macro_nzv <- caret::nearZeroVar(macro)
stopifnot(length(macro_nzv) == 0)

Macro imputation

Some modeling algorithms deal poorly with missing values, so impute values where needed. We don’t want to lose the NA-ness entirely, however, so add an ‘unknown’ level to factors, and an isna indicator column for each numeric column.

# Add *_isna indicator columns so missingness itself survives as a signal,
# then bag-impute the remaining NAs with caret.
macro_cleaned <- fe_add_isna_col_factory(macro)(macro)  # don't really need to pipeline this
macro_imputer <- caret::preProcess(macro_cleaned, method=c('bagImpute'))
macro_preproc <- predict(macro_imputer, macro_cleaned)

# Sanity check: imputation should leave no NAs at all.
print(sprintf('after imputation, have %d NAs in macro data', sum(is.na(macro_preproc))))
## [1] "after imputation, have 0 NAs in macro data"
# Yeo-Johnson transform the columns selected by powerTransformCols
# (presumably the skewed numeric ones -- see its definition), leave the
# rest untouched, and persist both csv and Rdata copies for downstream use.
macro_pt_cols <- powerTransformCols(macro_preproc)
macro_pt <- caret::preProcess(macro_preproc[,macro_pt_cols], method='YeoJohnson')
macro_preproc <- cbind(sans_cols(macro_preproc, macro_pt_cols), predict(macro_pt, macro_preproc[,macro_pt_cols]))
write.csv(macro_preproc, 'clean_20170614_macro_preproc.csv', row.names=FALSE)
save(macro_preproc, file='clean_20170614_macro_preproc.Rdata')

Test and Train data

Test and train data need to be “fixed” to include the bad address fix, released late in the contest.

Much of the following was originally done in Clean_by_location.Rmd.

Load data

# Load train, and test with a placeholder price_doc column, so the two can
# be row-bound into one frame and cleaned together.
train <- read.csv(input_filename('train.csv'), stringsAsFactors=FALSE)
test <- cbind(read.csv(input_filename('test.csv'), stringsAsFactors=FALSE), price_doc=NA)
# identical() also catches a length mismatch, which all(... == ...) would
# only warn about (and recycling could even mask).
stopifnot(identical(names(test), names(train)))
overall_data <- rbind(train, test)
rm(train, test)

# Corrections for bad addresses, released late in the contest.
bad_address_data <- read.csv(input_filename('BAD_ADDRESS_FIX.csv'), stringsAsFactors=FALSE)



overall_data <- fix_bad_addresses(overall_data, bad_address_data)

Fix column types

# Normalize the raw column types of the combined data (see
# fix_raw_column_types, defined elsewhere, for exactly what is coerced).
overall_data <- fix_raw_column_types(overall_data)

Add location_id and neighborhood_id features?

For exploring location specific attributes it might make sense to create location id and neighborhood id columns. But in general, these high cardinality features get in the way.

# Deliberately disabled: these md5-derived id columns are high-cardinality
# and get in the way of most models. Flip to TRUE only when exploring
# location-specific attributes.
if (FALSE) {
    overall_data$location_id <- as.character(openssl::md5(apply(just_cols(overall_data, location_id_features), 1, paste, collapse='|')))
    overall_data$neighborhood_id <- as.character(openssl::md5(apply(just_cols(overall_data, neighborhood_id_features), 1, paste, collapse='|')))
}

Clean up raion-specific features

All the features for a given raion should be the same, so clean those up separately to ensure they are imputed consistently, like the macro data.

This was originally done in Raion_Features.Rmd and Raion_Features2.Rmd.

# slim raion data down into one row per raion so it is all imputed consistently.
# (unique() collapses the repeated per-property rows; assuming the raion
# features really are constant within a sub_area, this yields one row each.)
raion_data <- unique(just_cols(overall_data, c('sub_area', raw_raion_features)))

Fix “all” columns

As mentioned on the forums, the full_all columns are bad.

# Scatter full_all against raion_popul with a y=x reference line; the
# divergence from the line shows full_all does not track raion population
# (it appears to be aggregated at a coarser level -- fixed below).
qplot(raion_popul, full_all, data=raion_data)+geom_abline(slope=1, intercept=0)

Crib from Josep Garriga’s post [https://www.kaggle.com/aralai/full-all-vs-raion-popul/notebook]

# Map each sub_area (raion) to its administrative district, cribbed from
# Josep Garriga's post [https://www.kaggle.com/aralai/full-all-vs-raion-popul/notebook],
# then rebuild the broken full_all/male_f/female_f columns as
# district-level population sums. A single lookup table replaces the
# original twelve copy-pasted assignment statements.
district_raions <- list(
  'Górod Zelenograd' = c('Krjukovo','Matushkino','Silino','Savelki','Staroe Krjukovo'),
  'Séverny administrativny ókrug' = c(
    'Golovinskoe','Koptevo','Vostochnoe Degunino','Dmitrovskoe','Timirjazevskoe',
    'Hovrino','Zapadnoe Degunino','Beskudnikovskoe','Ajeroport','Vojkovskoe',
    'Savelovskoe','Sokol','Horoshevskoe','Levoberezhnoe','Begovoe','Molzhaninovskoe'),
  'Yugo-Vostochny administrativny ókrug' = c(
    'Mar\'ino','Vyhino-Zhulebino','Ljublino','Kuz\'minki','Rjazanskij','Tekstil\'shhiki',
    'Lefortovo','Pechatniki','Juzhnoportovoe','Nizhegorodskoe','Kapotnja','Nekrasovka'),
  'Sévero-Vostochny administrativny ókrug' = c(
    'Otradnoe','Bibirevo','Severnoe Medvedkovo','Jaroslavskoe','Babushkinskoe','Juzhnoe Medvedkovo',
    'Losinoostrovskoe','Lianozovo','Alekseevskoe','Butyrskoe','Mar\'ina Roshha','Ostankinskoe',
    'Sviblovo','Altuf\'evskoe','Rostokino','Severnoe','Marfino'),
  'Vostochny administrativny ókrug' = c(
    'Gol\'janovo','Perovo','Ivanovskoe','Veshnjaki','Bogorodskoe','Novokosino','Izmajlovo',
    'Novogireevo','Sokolinaja Gora','Severnoe Izmajlovo','Preobrazhenskoe','Vostochnoe Izmajlovo',
    'Kosino-Uhtomskoe','Sokol\'niki','Metrogorodok','Vostochnoe'),
  'Yuzhny administrativny ókrug' = c(
    'Orehovo-Borisovo Juzhnoe','Birjulevo Vostochnoe','Chertanovo Juzhnoe','Zjablikovo',
    'Orehovo-Borisovo Severnoe','Caricyno','Nagatinskij Zaton','Chertanovo Central\'noe',
    'Chertanovo Severnoe','Brateevo','Danilovskoe','Birjulevo Zapadnoe','Nagornoe',
    'Nagatino-Sadovniki','Moskvorech\'e-Saburovo','Donskoe'),
  'Novomoskovsky administrativny okrug' = c(
    'Poselenie Shherbinka','Poselenie Desjonovskoe','Poselenie Vnukovskoe','Poselenie Sosenskoe',
    'Poselenie Voskresenskoe','Poselenie Kokoshkino','Poselenie Moskovskij','Poselenie Rjazanovskoe',
    'Poselenie Marushkinskoe','Poselenie Filimonkovskoe','Poselenie Mosrentgen'),
  'Troitsky administrativny okrug' = c(
    'Troickij okrug','Poselenie Kievskij','Poselenie Voronovskoe','Poselenie Krasnopahorskoe',
    'Poselenie Novofedorovskoe','Poselenie Mihajlovo-Jarcevskoe','Poselenie Rogovskoe',
    'Poselenie Pervomajskoe','Poselenie Klenovskoe','Poselenie Shhapovskoe'),
  'Yugo-Západny administrativny ókrug' = c(
    'Juzhnoe Butovo','Jasenevo','Kon\'kovo','Teplyj Stan','Zjuzino','Akademicheskoe',
    'Cheremushki','Severnoe Butovo','Lomonosovskoe','Obruchevskoe','Gagarinskoe','Kotlovka'),
  'Západny administrativny ókrug' = c(
    'Kuncevo','Mozhajskoe','Ramenki','Ochakovo-Matveevskoe','Solncevo','Troparevo-Nikulino',
    'Fili Davydkovo','Novo-Peredelkino','Filevskij Park','Krylatskoe','Dorogomilovo',
    'Prospekt Vernadskogo','Vnukovo'),
  'Sévero-Západny administrativny ókrug' = c(
    'Mitino','Horoshevo-Mnevniki','Severnoe Tushino','Strogino','Shhukino',
    'Juzhnoe Tushino','Pokrovskoe Streshnevo','Kurkino'),
  'Tsentralny administrativny ókrug' = c(
    'Presnenskoe','Taganskoe','Basmannoe','Hamovniki','Tverskoe','Meshhanskoe',
    'Zamoskvorech\'e','Krasnosel\'skoe','Arbat','Jakimanka')
)
raion_data$district <- NA
for (district_name in names(district_raions)) {
  raion_data[which(raion_data$sub_area %in% district_raions[[district_name]]), 'district'] <- district_name
}

raion_data$district <- factor(raion_data$district)
# Per-raion male/female totals summed from the age-bracket columns.
raion_data$raion_popul_male <- with(raion_data, young_male+work_male+ekder_male)
raion_data$raion_popul_female <- with(raion_data, young_female+work_female+ekder_female)

# District-level population totals to replace the unreliable
# full_all/male_f/female_f raw columns.
district <- raion_data %>%
  group_by(district) %>%
  summarize(full_all=sum(raion_popul),
            male_f=sum(raion_popul_male),
            female_f=sum(raion_popul_female))

raion_data$full_all <- NULL
raion_data$male_f <- NULL
raion_data$female_f <- NULL

raion_data <- merge(raion_data, district, by='district')

Look at the raion data

# Plot the empirical CDF of every raion feature, skipping the leading
# column (district, after the merge above), to eyeball each distribution.
for (col in names(raion_data)[-1]) {
    x <- raion_data[[col]]
    e <- ecdf(x)
    print(qplot(x, e(x)) + ggtitle(sprintf("Distribution of %s across raions", col)))
}

## Warning: Removed 22 rows containing missing values (geom_point).

## Warning: Removed 21 rows containing missing values (geom_point).

## Warning: Removed 63 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

## Warning: Removed 15 rows containing missing values (geom_point).

Drop columns with only a few non-NA values

If there were columns where most values were NA and only a few non-NAs, drop those columns. (In fact, there are no such columns.)

# Count non-NA entries per column and drop any column with fewer than 4
# real values. colSums over the logical !is.na matrix replaces the earlier
# per-column sapply loop (same named numeric vector, but vectorized and
# without sapply's unstable return type).
non_NAs <- colSums(!is.na(raion_data))
raion_data <- sans_cols(raion_data, names(raion_data)[non_NAs < 4])

Handle NAs before making other features

# Ordered pipeline of trainer factories for the raion data: each factory
# takes training data and returns a transform function. NA handling
# (indicator columns, then imputation) must run before the feature
# engineering trainers appended below.
raion_trainers <- c(
    add_is_na=fe_add_isna_col_factory,
    impute=na_impute_factory
    )

Raion feature engineering

Misc ratios

raion_trainers <- c(raion_trainers, misc_fe=function(train_data) {
    # Factory returning a transform that adds assorted per-raion ratio
    # features. inf_to_2max (defined elsewhere -- presumably caps Inf from
    # divide-by-zero; confirm) guards the quotient features.
    function(df) {
        df$pop_per_kmsq <- df$raion_popul / (df$area_m / 1000 / 1000)
        df$children_preschool_perpop <- df$children_preschool / df$raion_popul
        df$children_school_perpop <- df$children_school / df$raion_popul
        df$children_preschool_per_quota <- inf_to_2max(df$children_preschool / df$preschool_quota)
        df$children_school_per_quota <- inf_to_2max(df$children_school / df$school_quota)
        df$children_per_school <- inf_to_2max(df$children_school / df$school_education_centers_raion)
        df$top_20_schools_pct <- inf_to_2max(ifelse(df$school_education_centers_top_20_raion > 0,
                                                    df$school_education_centers_top_20_raion / df$school_education_centers_raion,
                                                    0))
        df
    }
})

Population columns

raion_trainers <- c(raion_trainers, population_fe=function(train_data) {
    # Ratio features for every population-breakdown column relative to
    # total raion population, excluding the rebuilt full_all.
    breakdown_cols <- grep('_all|_male|_female', names(train_data), value=TRUE)
    fe_ratios_factory('raion_popul', setdiff(breakdown_cols, 'full_all'))
})

Building material columns

raion_trainers <- c(raion_trainers, building_materials_fe=function(train_data) {
    # Share-of-total ratios for each construction-material count column.
    materials <- c('block', 'wood', 'frame', 'brick', 'monolith',
                   'panel', 'foam', 'slag', 'mix')
    fe_ratios_factory('raion_build_count_with_material_info',
                      paste0('build_count_', materials))
})

Building build year columns

raion_trainers <- c(raion_trainers, building_years_fe=function(train_data) {
    # Share-of-total ratios for each build-date bucket column.
    buckets <- c('before_1920', '1921.1945', '1946.1970',
                 '1971.1995', 'after_1995')
    fe_ratios_factory('raion_build_count_with_builddate_info',
                      paste0('build_count_', buckets))
})

Raion with applied trainers and transforms

Apply all of those transformations and save the result.

# district is a problem.
# temporarily remove
# NOTE(review): unclear from here why district breaks transform_data --
# possibly its factor levels; confirm. dr preserves the sub_area ->
# district pairing so it can be re-attached after the transforms.
dr <- raion_data[,c('sub_area', 'district')]
raion_data_fe <- transform_data(raion_trainers, sans_cols(raion_data, 'district'), verbose=TRUE)
## [1] "applying transform 1, add_is_na to train"
## [1] "after transform class(data)" "data.frame"                 
## [3] "dim(data)"                   "147"                        
## [5] "93"                          "NAs"                        
## [7] "346"                        
## [1] "applying transform 2, impute to train"
##  [1] "imputing on"                               
##  [2] "sub_area"                                  
##  [3] "area_m"                                    
##  [4] "raion_popul"                               
##  [5] "green_zone_part"                           
##  [6] "indust_part"                               
##  [7] "children_preschool"                        
##  [8] "preschool_quota"                           
##  [9] "preschool_education_centers_raion"         
## [10] "children_school"                           
## [11] "school_quota"                              
## [12] "school_education_centers_raion"            
## [13] "school_education_centers_top_20_raion"     
## [14] "hospital_beds_raion"                       
## [15] "healthcare_centers_raion"                  
## [16] "university_top_20_raion"                   
## [17] "sport_objects_raion"                       
## [18] "additional_education_raion"                
## [19] "culture_objects_top_25"                    
## [20] "culture_objects_top_25_raion"              
## [21] "shopping_centers_raion"                    
## [22] "office_raion"                              
## [23] "thermal_power_plant_raion"                 
## [24] "incineration_raion"                        
## [25] "oil_chemistry_raion"                       
## [26] "radiation_raion"                           
## [27] "railroad_terminal_raion"                   
## [28] "big_market_raion"                          
## [29] "nuclear_reactor_raion"                     
## [30] "detention_facility_raion"                  
## [31] "young_all"                                 
## [32] "young_male"                                
## [33] "young_female"                              
## [34] "work_all"                                  
## [35] "work_male"                                 
## [36] "work_female"                               
## [37] "ekder_all"                                 
## [38] "ekder_male"                                
## [39] "ekder_female"                              
## [40] "X0_6_all"                                  
## [41] "X0_6_male"                                 
## [42] "X0_6_female"                               
## [43] "X7_14_all"                                 
## [44] "X7_14_male"                                
## [45] "X7_14_female"                              
## [46] "X0_17_all"                                 
## [47] "X0_17_male"                                
## [48] "X0_17_female"                              
## [49] "X16_29_all"                                
## [50] "X16_29_male"                               
## [51] "X16_29_female"                             
## [52] "X0_13_all"                                 
## [53] "X0_13_male"                                
## [54] "X0_13_female"                              
## [55] "raion_build_count_with_material_info"      
## [56] "build_count_block"                         
## [57] "build_count_wood"                          
## [58] "build_count_frame"                         
## [59] "build_count_brick"                         
## [60] "build_count_monolith"                      
## [61] "build_count_panel"                         
## [62] "build_count_foam"                          
## [63] "build_count_slag"                          
## [64] "build_count_mix"                           
## [65] "raion_build_count_with_builddate_info"     
## [66] "build_count_before_1920"                   
## [67] "build_count_1921.1945"                     
## [68] "build_count_1946.1970"                     
## [69] "build_count_1971.1995"                     
## [70] "build_count_after_1995"                    
## [71] "raion_popul_male"                          
## [72] "raion_popul_female"                        
## [73] "full_all"                                  
## [74] "male_f"                                    
## [75] "female_f"                                  
## [76] "preschool_quota_isna"                      
## [77] "school_quota_isna"                         
## [78] "hospital_beds_raion_isna"                  
## [79] "raion_build_count_with_material_info_isna" 
## [80] "build_count_block_isna"                    
## [81] "build_count_wood_isna"                     
## [82] "build_count_frame_isna"                    
## [83] "build_count_brick_isna"                    
## [84] "build_count_monolith_isna"                 
## [85] "build_count_panel_isna"                    
## [86] "build_count_foam_isna"                     
## [87] "build_count_slag_isna"                     
## [88] "build_count_mix_isna"                      
## [89] "raion_build_count_with_builddate_info_isna"
## [90] "build_count_before_1920_isna"              
## [91] "build_count_1921.1945_isna"                
## [92] "build_count_1946.1970_isna"                
## [93] "build_count_1971.1995_isna"                
## [94] "build_count_after_1995_isna"               
## [1] "imputer is null?" "FALSE"           
## [1] "imputer class" "preProcess"   
## [1] "there are "            "346"                   "NAs in the train data"
##  [1] "in imputation, predict_data has cols"      
##  [2] "sub_area"                                  
##  [3] "area_m"                                    
##  [4] "raion_popul"                               
##  [5] "green_zone_part"                           
##  [6] "indust_part"                               
##  [7] "children_preschool"                        
##  [8] "preschool_quota"                           
##  [9] "preschool_education_centers_raion"         
## [10] "children_school"                           
## [11] "school_quota"                              
## [12] "school_education_centers_raion"            
## [13] "school_education_centers_top_20_raion"     
## [14] "hospital_beds_raion"                       
## [15] "healthcare_centers_raion"                  
## [16] "university_top_20_raion"                   
## [17] "sport_objects_raion"                       
## [18] "additional_education_raion"                
## [19] "culture_objects_top_25"                    
## [20] "culture_objects_top_25_raion"              
## [21] "shopping_centers_raion"                    
## [22] "office_raion"                              
## [23] "thermal_power_plant_raion"                 
## [24] "incineration_raion"                        
## [25] "oil_chemistry_raion"                       
## [26] "radiation_raion"                           
## [27] "railroad_terminal_raion"                   
## [28] "big_market_raion"                          
## [29] "nuclear_reactor_raion"                     
## [30] "detention_facility_raion"                  
## [31] "young_all"                                 
## [32] "young_male"                                
## [33] "young_female"                              
## [34] "work_all"                                  
## [35] "work_male"                                 
## [36] "work_female"                               
## [37] "ekder_all"                                 
## [38] "ekder_male"                                
## [39] "ekder_female"                              
## [40] "X0_6_all"                                  
## [41] "X0_6_male"                                 
## [42] "X0_6_female"                               
## [43] "X7_14_all"                                 
## [44] "X7_14_male"                                
## [45] "X7_14_female"                              
## [46] "X0_17_all"                                 
## [47] "X0_17_male"                                
## [48] "X0_17_female"                              
## [49] "X16_29_all"                                
## [50] "X16_29_male"                               
## [51] "X16_29_female"                             
## [52] "X0_13_all"                                 
## [53] "X0_13_male"                                
## [54] "X0_13_female"                              
## [55] "raion_build_count_with_material_info"      
## [56] "build_count_block"                         
## [57] "build_count_wood"                          
## [58] "build_count_frame"                         
## [59] "build_count_brick"                         
## [60] "build_count_monolith"                      
## [61] "build_count_panel"                         
## [62] "build_count_foam"                          
## [63] "build_count_slag"                          
## [64] "build_count_mix"                           
## [65] "raion_build_count_with_builddate_info"     
## [66] "build_count_before_1920"                   
## [67] "build_count_1921.1945"                     
## [68] "build_count_1946.1970"                     
## [69] "build_count_1971.1995"                     
## [70] "build_count_after_1995"                    
## [71] "raion_popul_male"                          
## [72] "raion_popul_female"                        
## [73] "full_all"                                  
## [74] "male_f"                                    
## [75] "female_f"                                  
## [76] "preschool_quota_isna"                      
## [77] "school_quota_isna"                         
## [78] "hospital_beds_raion_isna"                  
## [79] "raion_build_count_with_material_info_isna" 
## [80] "build_count_block_isna"                    
## [81] "build_count_wood_isna"                     
## [82] "build_count_frame_isna"                    
## [83] "build_count_brick_isna"                    
## [84] "build_count_monolith_isna"                 
## [85] "build_count_panel_isna"                    
## [86] "build_count_foam_isna"                     
## [87] "build_count_slag_isna"                     
## [88] "build_count_mix_isna"                      
## [89] "raion_build_count_with_builddate_info_isna"
## [90] "build_count_before_1920_isna"              
## [91] "build_count_1921.1945_isna"                
## [92] "build_count_1946.1970_isna"                
## [93] "build_count_1971.1995_isna"                
## [94] "build_count_after_1995_isna"               
## [1] 147  93
## [1] "there are "              "346"                    
## [3] "NAs in the predict data"
## [1] "done predicting. Have " "0"                     
## [3] "NAs in the results"    
## [1] "after transform class(data)" "data.frame"                 
## [3] "dim(data)"                   "147"                        
## [5] "93"                          "NAs"                        
## [7] "0"                          
## [1] "applying transform 3, misc_fe to train"
## [1] "after transform class(data)" "data.frame"                 
## [3] "dim(data)"                   "147"                        
## [5] "100"                         "NAs"                        
## [7] "0"                          
## [1] "applying transform 4, population_fe to train"
## [1] "after transform class(data)" "data.frame"                 
## [3] "dim(data)"                   "147"                        
## [5] "126"                         "NAs"                        
## [7] "0"                          
## [1] "applying transform 5, building_materials_fe to train"
## [1] "after transform class(data)" "data.frame"                 
## [3] "dim(data)"                   "147"                        
## [5] "135"                         "NAs"                        
## [7] "0"                          
## [1] "applying transform 6, building_years_fe to train"
## [1] "after transform class(data)" "data.frame"                 
## [3] "dim(data)"                   "147"                        
## [5] "140"                         "NAs"                        
## [7] "0"
# Attach the district lookup to the engineered raion features, then
# re-create the factor so any unused district levels are dropped.
raion_data_fe <- merge(x = raion_data_fe, y = dr, by = 'sub_area')
raion_data_fe$district <- factor(raion_data_fe$district)

# Persist both an Rdata and a CSV copy for downstream scripts.
save(raion_data_fe, file = 'clean_20170614_raion_data_fe.Rdata')
write.csv(raion_data_fe, file = 'clean_20170614_raion_data_fe.csv', row.names = FALSE)

Chippy’s lat, lon locations

I had spent several days trying to guess locations from the data without using any shape files, by simultaneously estimating the location of each property and each train station, metro station and bus station. Distance from the Kremlin is also known, so I put the Kremlin as my origin and modeled properties as just an angle theta from the Kremlin, eliminating half of the property coordinates to estimate. This seemed attractive as a pure-math-no-external-data-at-all approach.

In simulation (see kmts.R) and some testing (locations.Rmd), this ran a long time, but did not look like it would reliably converge to a correct solution (modulo rotation and reflection.)

On the forums, Chippy (Nigel Carpenter) opted to use shapefile information and go from the distances to known-location streets, a much less computationally intensive approach.

And he posted his lat, lon coordinates, in addition to his script.

Thanks, Chippy!

Unfortunately, the ‘bad address’ problem presumably impacts the lat/long coordinates Chippy posted, so his code is re-run here on the updated data.

It takes a while (~6 hours), so check to see if results already exist before running anything.

# Chippy's lat/lon solver takes ~6 hours; only run it when its per-raion
# output files are absent (Zjuzino stands in as a sentinel for the set).
solver_needed <- !file.exists('../input/Zjuzino_lat_lon.csv')
if (solver_needed) {
    source('chippy_ll_solver.R')
    chippy_ll_solver(overall_data)
}

# assemble raion-level input files into a clean form with each ID
# NOTE: scalar condition, so use || (short-circuit) rather than vectorized |.
if (!file.exists('clean_20170614_lat_lon.csv') ||
    !file.exists('clean_20170614_lat_lon.Rdata')) {
    # assemble raion-level lat and lon into one file. Read every raion CSV
    # first and bind once at the end: growing a data frame with rbind()
    # inside the loop is O(n^2) in copies.
    lst_subarea <- sort(unique(overall_data$sub_area))
    lat_lon <- do.call(rbind, lapply(lst_subarea, function(sa) {
        read.csv(paste0('../input/', sa, "_lat_lon.csv"), stringsAsFactors=FALSE)
    }))

    # save location counts for map below.
    location_counts <- lat_lon
    write.csv(location_counts, 'clean_20170614_lat_lon_counts.csv', row.names=FALSE)
    save(location_counts, file='clean_20170614_lat_lon_counts.Rdata')

    # merge lat_lon with the row IDs. The key replicates how the solver
    # identified a location (distances + raion), so it joins exactly.
    overall_data$key <- with(overall_data, paste(mkad_km, ttk_km, sadovoe_km, sub_area, sep=":"))
    lat_lon <- merge(just_cols(overall_data, c(id_col, 'timestamp', 'key')),
                     just_cols(lat_lon, c('key', 'lat', 'lon', 'tolerance_m')),
                     by='key', all.x=TRUE)  # keep rows even if no location found

    overall_data$key <- NULL
    lat_lon <- lat_lon[order(just_col(lat_lon, id_col)),]
    write.csv(lat_lon, 'clean_20170614_lat_lon.csv', row.names=FALSE)
    save(lat_lon, file='clean_20170614_lat_lon.Rdata')
}

# merge lat, lon into overall_data (joins on the shared id column).
load('clean_20170614_lat_lon.Rdata')  # defines lat_lon
overall_data <- merge(overall_data,
                      just_cols(lat_lon, c(id_col, 'lat', 'lon', 'tolerance_m')))

Check map

Look at counts.

library(leaflet)
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.2-5, (SVN revision 648)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 1.10.1, released 2013/08/26
##  Path to GDAL shared files: /usr/share/gdal/1.10
##  Loaded PROJ.4 runtime: Rel. 4.8.0, 6 March 2012, [PJ_VERSION: 480]
##  Path to PROJ.4 shared files: (autodetected)
##  Linking to sp version: 1.2-4
library(rjson)

load('clean_20170614_lat_lon_counts.Rdata')  # defines location_counts

print(summary(unique(location_counts$count)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0    24.0    50.0   102.6   118.0   978.0
# ECDF of per-location property counts, evaluated at each distinct count.
uniq_counts <- unique(location_counts$count)
uniq_counts <- uniq_counts[order(uniq_counts)]
count_cdf <- ecdf(location_counts$count)(uniq_counts)
qplot(uniq_counts, count_cdf) + ggtitle('ECDF of properties at exact same location')

# Show the locations where more than 250 properties share one exact point.
print(just_cols(subset(location_counts, count > 250), c('count', 'sub_area', 'lat', 'lon', 'tolerance_m')))
##       count                sub_area      lat      lon tolerance_m
## 6858    255                  Mitino 55.84512 37.36375           5
## 7315    260       Nagatinskij Zaton 55.68522 37.69607           5
## 7553    978              Nekrasovka 55.68410 37.92835           5
## 9119    366  Poselenie Desjonovskoe 55.52464 37.37443           5
## 9230    719    Poselenie Moskovskij 55.60371 37.34452          25
## 9466    364     Poselenie Sosenskoe 55.57427 37.49955          25
## 9467    255     Poselenie Sosenskoe 55.57085 37.47434           5
## 9607    340    Poselenie Vnukovskoe 55.63269 37.32573           5
## 9608    331    Poselenie Vnukovskoe 55.63357 37.33098          10
## 9738    285 Poselenie Voskresenskoe 55.52565 37.50405          25
## 9739    260 Poselenie Voskresenskoe 55.52664 37.50568          25
## 11520   297                Solncevo 55.65362 37.40218           5
## 13277   298       Zapadnoe Degunino 55.88029 37.49646          25
pal <- colorNumeric(palette = "RdYlGn", domain = c(0,1000), reverse= TRUE)

shp_moscow_adm <- rgdal::readOGR(dsn = "../input/sberbankmoscowroads", layer = "moscow_adm", verbose=FALSE)
# Make data frame with centroids, one row per raion polygon. Use the actual
# feature count instead of the hard-coded 146 so a changed shapefile still works.
raion_centroids <- do.call(rbind.data.frame, lapply(seq_len(nrow(shp_moscow_adm)), function(i) { 
    c(coordinates(rgeos::gCentroid(shp_moscow_adm[i,])), name=as.character(shp_moscow_adm[i,]$RAION)) 
    }))
names(raion_centroids) <- c('x','y','name')
# rbind.data.frame coerced columns to factors; restore numeric and character.
raion_centroids$x <- as.numeric(as.character(raion_centroids$x))
raion_centroids$y <- as.numeric(as.character(raion_centroids$y))
raion_centroids$name <- as.character(raion_centroids$name)  # was $na, relying on $ partial matching

# make a GeoJSON form of the shapefile if we don't have one already.
if(!file.exists('shp_moscow_adm.GeoJSON')) {
    rgdal::writeOGR(shp_moscow_adm, dsn='shp_moscow_adm.GeoJSON', driver='GeoJSON', layer=1)
}
gj <- rjson::fromJSON(file='shp_moscow_adm.GeoJSON')
gj_names <- sapply(gj$features, function(f) {f$properties$RAION})
# Build the map: raion outlines, raion name labels at centroids, and one
# circle per distinct property location, colored by property count.
r <- leaflet::leaflet(data=location_counts) %>%
    leaflet::addGeoJSON(gj) %>%
    leaflet::addLabelOnlyMarkers(lng=raion_centroids$x, lat=raion_centroids$y, 
                        label=raion_centroids$name, 
                        labelOptions=labelOptions(
                            noHide=TRUE,
                            textOnly=TRUE)) %>%
    leaflet::addCircles(lat=~lat, lng=~lon, col=~pal(count), fill=TRUE, weight=10, label=~paste0("Count: ", count)) %>%
    leaflet::addLegend("topright", pal=pal, values=~count, title = "Property count", opacity=0.8)

r  # Print the map

Property-specific feature clean up

Looking at the properties, there are many features that are location-specific, like all the distances, and the “neighborhood” information about nearby cafes et al. There are also many raion-specific features, common across the whole raion.

This leaves fairly few features that are specific to the property itself, and it’s good to look at those:

full_sq and life_sq

Have life sq < 5 or full sq < 5 is probably an error. Remove to impute later. Also have full sq < life sq, usually by a lot. Guess that this is a coding error and full is accidentally recorded as the extra.

qplot(life_sq, full_sq, data=overall_data)+scale_x_log10()+scale_y_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 7559 rows containing missing values (geom_point).

# Areas below 5 sq m are implausible; blank them so imputation fills them in.
overall_data$life_sq[overall_data$life_sq < 5] <- NA
overall_data$full_sq[overall_data$full_sq < 5] <- NA

# Where full < life, assume full_sq was accidentally recorded as the *extra*
# area, and reconstruct the total as life + recorded-full. which() drops
# positions where either value is NA.
swap_rows <- which(overall_data$full_sq < overall_data$life_sq)
overall_data$full_sq[swap_rows] <- overall_data$life_sq[swap_rows] + overall_data$full_sq[swap_rows]

floor and max_floor

Have max floor < floor in 2136 cases. Assume floor is more accurate than max floor.

qplot(floor, max_floor, data=overall_data)
## Warning: Removed 9572 rows containing missing values (geom_point).

# Where a unit's floor exceeds the building's max_floor, trust floor and
# raise max_floor to match.
bad_max <- with(overall_data, !is.na(max_floor) & !is.na(floor) & max_floor < floor)
overall_data$max_floor[bad_max] <- overall_data$floor[bad_max]

Material.

Have only two points, one test and one train, of material==3. Drop.

qplot(material, data=overall_data)

# material == 3 occurs only twice (one test, one train); treat as noise.
idx <- with(overall_data, material == 3)
# Guard against NA in the logical index, matching the pattern used by every
# other cleanup step here. (The original worked only because a length-1 NA
# replacement silently skips NA positions.)
idx <- !is.na(idx) & idx
overall_data$material[idx] <- NA

Build year

Build year spans a huge range, from 0 to 20052009. Assume numbers before 1860 are bad, as is 4965. Convert 20052009 into 2007 as a guess.

One of the discussion pages says that properties are sometimes pre-sold, so allow that.

print(summary(overall_data$build_year))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##        0     1966     1980     2717     2006 20050000    14654
print(xtabs(~subset(overall_data, build_year < 1860 | build_year > 2017)$build_year))
## subset(overall_data, build_year < 1860 | build_year > 2017)$build_year
##        0        1        2        3       20       71      215     1691 
##      899      555        1        2        1        1        2        1 
##     2018     2019     4965 20052009 
##       31        5        1        1
qplot(build_year, data=overall_data)+scale_x_continuous(limits=c(1860, 2020))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 16118 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).

# Years before 1860 are presumed data-entry errors; NA them for imputation.
overall_data$build_year[overall_data$build_year < 1860] <- NA
# 20052009 looks like two years run together; split the difference.
overall_data$build_year[overall_data$build_year %in% 20052009] <- 2007 # guess
# 4965 is not a plausible year.
overall_data$build_year[overall_data$build_year %in% 4965] <- NA

num rooms

Less than 1 room is probably incorrect.

print(summary(overall_data$num_room))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   1.000   2.000   1.901   2.000  19.000    9572
qplot(num_room, data=overall_data)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9572 rows containing non-finite values (stat_bin).

# Zero rooms can't be right; NA it so imputation can fill it in.
overall_data$num_room[which(overall_data$num_room < 1)] <- NA

kitchen sq

Have some kitchen sq that look like build years. Use those to guess a build year. Have many that look the same size (or bigger than) the life or full sq. Clear those.

print(summary(overall_data$kitch_sq))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##    0.000    1.000    6.000    6.544    9.000 2014.000     9572
qplot(kitch_sq, data=overall_data)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 9572 rows containing non-finite values (stat_bin).

# kitch_sq values in the thousands look like build years; salvage them into
# build_year where it is missing. (Do this before clearing kitch_sq below —
# the same rows then get their kitch_sq blanked by the >= life_sq test.)
idx <- with(overall_data, kitch_sq > 1000 & is.na(build_year))
idx <- !is.na(idx) & idx
overall_data$build_year[idx] <- overall_data$kitch_sq[idx]
# A kitchen at least as large as the living area is implausible. Use an
# explicit !is.na() test rather than the original numeric-as-logical
# `life_sq &` truthiness (equivalent here: sub-5 areas were already NA'd).
idx <- with(overall_data, !is.na(life_sq) & kitch_sq >= life_sq)
idx <- !is.na(idx) & idx
overall_data$kitch_sq[idx] <- NA
# Same check against the total area.
idx <- with(overall_data, !is.na(full_sq) & kitch_sq >= full_sq)
idx <- !is.na(idx) & idx
overall_data$kitch_sq[idx] <- NA

State

We have one state of 33. That’s probably a typo for 3, so replace 33 with 3 and recreate the factor to drop the unused level.

plot(overall_data$state)

# '33' appears once and is presumably a typo for '3'; recode it, then rebuild
# the factor so the now-unused '33' level is dropped.
overall_data$state[overall_data$state == '33'] <- '3'
overall_data$state <- factor(overall_data$state)

Product Type

plot(overall_data$product_type)

Property specific feature imputation

# Create a 'disposition' column with at least 'train' and 'submission'
# levels: rows with a known price train the model, the rest are scored.
overall_data$disposition <- ifelse(is.na(overall_data$price_doc),
                                   'submission', 'train')


# The pipeline is a named list of transform factories: each entry takes the
# training data and returns a function that transforms any data set.
pipeline <- list(drop_nzv = function(train_data) {
    # Drop near-zero-variance columns, but never the target or the id.
    fe_drop_nzv_cols_factory(train_data, keep_cols = c(predict_col, id_col))
})
pipeline <- c(pipeline, impute_property_features=function(train_data) {
    # Impute property-level features, with lat and lon as extra predictors.
    prop_cols <- c('timestamp', raw_property_features, 'lat', 'lon')
    fit_df <- just_cols(train_data, c(id_col, prop_cols))
    print("train for imputing property features with columns....")
    print(names(fit_df))
    print(dim(fit_df))

    # Fit the imputer once on the training rows; the returned closure then
    # applies the same fitted imputation to any data set.
    imputer <- na_impute_factory(
        fit_df,
        predict_col=predict_col,
        drop_col='disposition',
        impute_on_just_na_cols=FALSE)
    function(data) {
        prop_df <- just_cols(data, c(id_col, prop_cols))
        print("apply imputing to property features with columns....")
        print(names(prop_df))
        print(dim(prop_df))
        prop_df <- imputer(prop_df)
        # Swap the property columns for their imputed versions.
        cbind(sans_cols(data, prop_cols), prop_df)
    }
})

Property specific feature engineering

  • add a sold-month column for monthly/seasonal trends
  • add a sold-year column for year progression
  • add a sold year month column for a finer grained progression
  • Add a ‘age at sale’ column, which can be negative if property pre-sold

  • Add a ‘floors above’ column with the number of floors above the unit
  • Add a ‘floor percentage’ column

library(lubridate)
pipeline <- c(pipeline, add_date_features=function(train_data) {
    # Stateless transform: derive calendar features from the sale timestamp.
    # age_at_sale can be negative when a property was pre-sold.
    function(data) {
        yr <- lubridate::year(data$timestamp)
        mo <- lubridate::month(data$timestamp)
        data$sold_year <- yr
        data$sold_month <- mo
        data$sold_ym <- paste(yr, mo, sep='-')
        data$age_at_sale <- data$build_year - yr
        data
    }
})

pipeline <- c(pipeline, add_floors_features=function(train_data) {
    # Stateless transform: where the unit sits within its building.
    function(data) {
        data$floors_above <- data$max_floor - data$floor
        data$floor_pct <- 100.0 * data$floor / data$max_floor
        data
    }
})

General scaling

Scaling is not important in a tree based model, so omit (defer?)

# Deliberately disabled: tree-based models are scale-invariant, so this
# stage is kept for reference in case a linear/NN model is tried later.
if (FALSE) {
    pipeline <- c(pipeline, scale=function(train_data) {
        scale_preprocessor_factory(train_data, predict_col=predict_col, 
                                   drop_cols=c(predict_col, 'id', 'disposition'))
    })
}

Apply transformations

# Fix the data: fit each pipeline stage on the training rows only, then
# apply the fitted transforms to every usable row (everything not 'skip').
# transform_data returns a list; element [[2]] is the transformed data.
overall_data_fe <- transform_data(pipeline,
               subset(overall_data, disposition=='train'),
               subset(overall_data, disposition!='skip'),
               verbose=TRUE,
               predict_col=predict_col
)[[2]]
## [1] "applying transform 1, drop_nzv to train"
## [1] "after transform class(data)" "data.frame"                 
## [3] "dim(data)"                   "30471"                      
## [5] "274"                         "NAs"                        
## [7] "265340"                     
## [1] "applying transform 1, drop_nzv to other data 1"
## [1] "after transform class(other_df)" "list"                           
## [3] "dim(other_df)"                   "NAs"                            
## [5] "0"                              
## [1] "applying transform 2, impute_property_features to train"
## [1] "train for imputing property features with columns...."
##  [1] "id"           "timestamp"    "full_sq"      "life_sq"     
##  [5] "floor"        "max_floor"    "material"     "build_year"  
##  [9] "num_room"     "kitch_sq"     "state"        "product_type"
## [13] "lat"          "lon"         
## [1] 30471    14
##  [1] "imputing on"  "timestamp"    "full_sq"      "life_sq"     
##  [5] "floor"        "max_floor"    "material"     "build_year"  
##  [9] "num_room"     "kitch_sq"     "state"        "product_type"
## [13] "lat"          "lon"         
## [1] "imputer is null?" "FALSE"           
## [1] "imputer class" "preProcess"   
## [1] "there are "            "73466"                 "NAs in the train data"
## [1] "apply imputing to property features with columns...."
##  [1] "id"           "timestamp"    "full_sq"      "life_sq"     
##  [5] "floor"        "max_floor"    "material"     "build_year"  
##  [9] "num_room"     "kitch_sq"     "state"        "product_type"
## [13] "lat"          "lon"         
## [1] 30471    14
##  [1] "in imputation, predict_data has cols"
##  [2] "timestamp"                           
##  [3] "full_sq"                             
##  [4] "life_sq"                             
##  [5] "floor"                               
##  [6] "max_floor"                           
##  [7] "material"                            
##  [8] "build_year"                          
##  [9] "num_room"                            
## [10] "kitch_sq"                            
## [11] "state"                               
## [12] "product_type"                        
## [13] "lat"                                 
## [14] "lon"                                 
## [1] 30471    13
## [1] "there are "              "73466"                  
## [3] "NAs in the predict data"
## [1] "done predicting. Have " "23132"                 
## [3] "NAs in the results"    
## [1] "after transform class(data)" "data.frame"                 
## [3] "dim(data)"                   "30471"                      
## [5] "275"                         "NAs"                        
## [7] "215006"                     
## [1] "applying transform 2, impute_property_features to other data 1"
## [1] "apply imputing to property features with columns...."
##  [1] "id"           "timestamp"    "full_sq"      "life_sq"     
##  [5] "floor"        "max_floor"    "material"     "build_year"  
##  [9] "num_room"     "kitch_sq"     "state"        "product_type"
## [13] "lat"          "lon"         
## [1] 38133    14
##  [1] "in imputation, predict_data has cols"
##  [2] "timestamp"                           
##  [3] "full_sq"                             
##  [4] "life_sq"                             
##  [5] "floor"                               
##  [6] "max_floor"                           
##  [7] "material"                            
##  [8] "build_year"                          
##  [9] "num_room"                            
## [10] "kitch_sq"                            
## [11] "state"                               
## [12] "product_type"                        
## [13] "lat"                                 
## [14] "lon"                                 
## [1] 38133    13
## [1] "there are "              "77343"                  
## [3] "NAs in the predict data"
## [1] "done predicting. Have " "23860"                 
## [3] "NAs in the results"    
## [1] "after transform class(other_df)" "list"                           
## [3] "dim(other_df)"                   "NAs"                            
## [5] "0"                              
## [1] "applying transform 3, add_date_features to train"
## [1] "after transform class(data)" "data.frame"                 
## [3] "dim(data)"                   "30471"                      
## [5] "279"                         "NAs"                        
## [7] "215006"                     
## [1] "applying transform 3, add_date_features to other data 1"
## [1] "after transform class(other_df)" "list"                           
## [3] "dim(other_df)"                   "NAs"                            
## [5] "0"                              
## [1] "applying transform 4, add_floors_features to train"
## [1] "after transform class(data)" "data.frame"                 
## [3] "dim(data)"                   "30471"                      
## [5] "281"                         "NAs"                        
## [7] "215006"                     
## [1] "applying transform 4, add_floors_features to other data 1"
## [1] "after transform class(other_df)" "list"                           
## [3] "dim(other_df)"                   "NAs"                            
## [5] "0"

Merge macro and raion data

# Strip any raion columns already present (except the join key) so the merge
# doesn't create duplicated .x/.y columns, then attach the cleaned features.
raion_cols <- setdiff(names(raion_data_fe), 'sub_area')
overall_data_fe <- merge(sans_cols(overall_data_fe, raion_cols),
                         raion_data_fe, by='sub_area')
# reconfirm timestamps are compatible: coerce both sides to Date first
macro_preproc$timestamp <- as.Date(macro_preproc$timestamp)
overall_data_fe$timestamp <- as.Date(overall_data_fe$timestamp)
overall_data_fe <- merge(overall_data_fe, macro_preproc, by='timestamp')

# Persist the fully-merged, feature-engineered data set.
save(overall_data_fe, file='clean_20170614_overall_data_fe.Rdata')
write.csv(overall_data_fe, file='clean_20170614_overall_data_fe.csv', row.names=FALSE)